{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "28JCZXo2qPQe"
   },
   "outputs": [],
   "source": [
    "from __future__ import absolute_import, division, print_function, unicode_literals\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import tensorflow as tf\n",
    "\n",
    "from tensorflow import feature_column\n",
    "from tensorflow.keras import layers\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 226
    },
    "colab_type": "code",
    "id": "WfFAIHKS2xxO",
    "outputId": "cd69745c-e55f-4552-ac61-305069212872"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>duration</th>\n",
       "      <th>protocol_type</th>\n",
       "      <th>service</th>\n",
       "      <th>flag</th>\n",
       "      <th>src_bytes</th>\n",
       "      <th>dst_bytes</th>\n",
       "      <th>land</th>\n",
       "      <th>wrong_fragment</th>\n",
       "      <th>urgent</th>\n",
       "      <th>hot</th>\n",
       "      <th>...</th>\n",
       "      <th>dst_host_srv_count</th>\n",
       "      <th>dst_host_same_srv_rate</th>\n",
       "      <th>dst_host_diff_srv_rate</th>\n",
       "      <th>dst_host_same_src_port_rate</th>\n",
       "      <th>dst_host_srv_diff_host_rate</th>\n",
       "      <th>dst_host_serror_rate</th>\n",
       "      <th>dst_host_srv_serror_rate</th>\n",
       "      <th>dst_host_rerror_rate</th>\n",
       "      <th>dst_host_srv_rerror_rate</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>181</td>\n",
       "      <td>5450</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>9</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>239</td>\n",
       "      <td>486</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>19</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>235</td>\n",
       "      <td>1337</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>29</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>219</td>\n",
       "      <td>1337</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>39</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>tcp</td>\n",
       "      <td>http</td>\n",
       "      <td>SF</td>\n",
       "      <td>217</td>\n",
       "      <td>2032</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>49</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>normal.</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 42 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   duration protocol_type service flag  src_bytes  dst_bytes  land  \\\n",
       "0         0           tcp    http   SF        181       5450     0   \n",
       "1         0           tcp    http   SF        239        486     0   \n",
       "2         0           tcp    http   SF        235       1337     0   \n",
       "3         0           tcp    http   SF        219       1337     0   \n",
       "4         0           tcp    http   SF        217       2032     0   \n",
       "\n",
       "   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \\\n",
       "0               0       0    0  ...                   9   \n",
       "1               0       0    0  ...                  19   \n",
       "2               0       0    0  ...                  29   \n",
       "3               0       0    0  ...                  39   \n",
       "4               0       0    0  ...                  49   \n",
       "\n",
       "   dst_host_same_srv_rate  dst_host_diff_srv_rate  \\\n",
       "0                     1.0                     0.0   \n",
       "1                     1.0                     0.0   \n",
       "2                     1.0                     0.0   \n",
       "3                     1.0                     0.0   \n",
       "4                     1.0                     0.0   \n",
       "\n",
       "   dst_host_same_src_port_rate  dst_host_srv_diff_host_rate  \\\n",
       "0                         0.11                          0.0   \n",
       "1                         0.05                          0.0   \n",
       "2                         0.03                          0.0   \n",
       "3                         0.03                          0.0   \n",
       "4                         0.02                          0.0   \n",
       "\n",
       "   dst_host_serror_rate  dst_host_srv_serror_rate  dst_host_rerror_rate  \\\n",
       "0                   0.0                       0.0                   0.0   \n",
       "1                   0.0                       0.0                   0.0   \n",
       "2                   0.0                       0.0                   0.0   \n",
       "3                   0.0                       0.0                   0.0   \n",
       "4                   0.0                       0.0                   0.0   \n",
       "\n",
       "   dst_host_srv_rerror_rate    label  \n",
       "0                       0.0  normal.  \n",
       "1                       0.0  normal.  \n",
       "2                       0.0  normal.  \n",
       "3                       0.0  normal.  \n",
       "4                       0.0  normal.  \n",
       "\n",
       "[5 rows x 42 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd \n",
    "\n",
    "# Feature names from the file kddcup.names file to be used as cols heading\n",
    "col_names = [\"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\n",
    "    \"dst_bytes\",\"land\",\"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\n",
    "    \"logged_in\",\"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\n",
    "    \"num_file_creations\",\"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\n",
    "    \"is_host_login\",\"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\",\n",
    "    \"srv_serror_rate\",\"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\",\n",
    "    \"diff_srv_rate\",\"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\n",
    "    \"dst_host_same_srv_rate\",\"dst_host_diff_srv_rate\",\"dst_host_same_src_port_rate\",\n",
    "    \"dst_host_srv_diff_host_rate\",\"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n",
    "    \"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\"label\"]\n",
    "\n",
    "df = pd.read_csv(\"../kddcup.data_10_percent_corrected\", header=None, names = col_names)\n",
    "\n",
    "# df=pd.read_csv('/content/gdrive/My Drive/data/kddcup.data_10_percent_corrected')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "9yZoa8rvLuEa"
   },
   "outputs": [],
   "source": [
    "## Data Preperation for training\n",
    "# ------------------------------\n",
    "def prepare_data(df):\n",
    "  '''This function prepares the dataset for training. \n",
    "  All categorical data will be encoded using LabelEncoder() of Sklearn. \n",
    "  Labels of each sample will be encoded into five categories as follows: -\n",
    "  \n",
    "    0 - Normal connection\n",
    "    1 - dos attack\n",
    "    2 - probe attack\n",
    "    3 - r2l attack\n",
    "    4 - u2r attack\n",
    "  '''\n",
    "  # Encoding the categorical label to five categories:\n",
    "  newlabeldf=df['label'].replace({ 'normal.' : 0, 'neptune.' : 1 ,'back.': 1, 'land.': 1, 'pod.': 1, 'smurf.': 1, 'teardrop.': 1,'mailbomb.': 1, 'apache2.': 1, 'processtable.': 1, 'udpstorm.': 1, 'worm.': 1,\n",
    "                           'ipsweep.' : 2,'nmap.' : 2,'portsweep.' : 2,'satan.' : 2,'mscan.' : 2,'saint.' : 2\n",
    "                           ,'ftp_write.': 3,'guess_passwd.': 3,'imap.': 3,'multihop.': 3,'phf.': 3,'spy.': 3,'warezclient.': 3,'warezmaster.': 3,'sendmail.': 3,'named.': 3,'snmpgetattack.': 3,'snmpguess.': 3,'xlock.': 3,'xsnoop.': 3,'httptunnel.': 3,\n",
    "                           'buffer_overflow.': 4,'loadmodule.': 4,'perl.': 4,'rootkit.': 4,'ps.': 4,'sqlattack.': 4,'xterm.': 4})\n",
    "  df['label'] = newlabeldf.astype('int')\n",
    "  \n",
    "  # Encoding categorical data using LabelEncoder()\n",
    "  le = LabelEncoder()\n",
    "  df['protocol_type'] = le.fit_transform(df['protocol_type'])\n",
    "  df['service']= le.fit_transform(df['service'])\n",
    "  df['flag'] = le.fit_transform(df['flag'])\n",
    "  \n",
    "  return df\n",
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "grxlSUbpp7R1"
   },
   "outputs": [],
   "source": [
    "# X = df.iloc[:,:41]\n",
    "# y = df.iloc[:,-1].astype('int')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "wwEJLsMBTHVM"
   },
   "outputs": [],
   "source": [
    "df = prepare_data(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "l0uviy0bMb0o"
   },
   "source": [
    "## Visualizing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 226
    },
    "colab_type": "code",
    "id": "WaoDh6iTL4f3",
    "outputId": "7bb98647-1695-40cb-d963-1e92a624f194"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>duration</th>\n",
       "      <th>protocol_type</th>\n",
       "      <th>service</th>\n",
       "      <th>flag</th>\n",
       "      <th>src_bytes</th>\n",
       "      <th>dst_bytes</th>\n",
       "      <th>land</th>\n",
       "      <th>wrong_fragment</th>\n",
       "      <th>urgent</th>\n",
       "      <th>hot</th>\n",
       "      <th>...</th>\n",
       "      <th>dst_host_count</th>\n",
       "      <th>dst_host_srv_count</th>\n",
       "      <th>dst_host_same_srv_rate</th>\n",
       "      <th>dst_host_diff_srv_rate</th>\n",
       "      <th>dst_host_same_src_port_rate</th>\n",
       "      <th>dst_host_srv_diff_host_rate</th>\n",
       "      <th>dst_host_serror_rate</th>\n",
       "      <th>dst_host_srv_serror_rate</th>\n",
       "      <th>dst_host_rerror_rate</th>\n",
       "      <th>dst_host_srv_rerror_rate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>-0.067792</td>\n",
       "      <td>0.925753</td>\n",
       "      <td>-0.104067</td>\n",
       "      <td>0.514274</td>\n",
       "      <td>-0.002879</td>\n",
       "      <td>0.138664</td>\n",
       "      <td>-0.006673</td>\n",
       "      <td>-0.04772</td>\n",
       "      <td>-0.002571</td>\n",
       "      <td>-0.044136</td>\n",
       "      <td>...</td>\n",
       "      <td>-3.451536</td>\n",
       "      <td>-1.694315</td>\n",
       "      <td>0.599396</td>\n",
       "      <td>-0.282867</td>\n",
       "      <td>-1.022077</td>\n",
       "      <td>-0.158629</td>\n",
       "      <td>-0.464418</td>\n",
       "      <td>-0.463202</td>\n",
       "      <td>-0.25204</td>\n",
       "      <td>-0.249464</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>-0.067792</td>\n",
       "      <td>0.925753</td>\n",
       "      <td>-0.104067</td>\n",
       "      <td>0.514274</td>\n",
       "      <td>-0.002820</td>\n",
       "      <td>-0.011578</td>\n",
       "      <td>-0.006673</td>\n",
       "      <td>-0.04772</td>\n",
       "      <td>-0.002571</td>\n",
       "      <td>-0.044136</td>\n",
       "      <td>...</td>\n",
       "      <td>-3.297085</td>\n",
       "      <td>-1.600011</td>\n",
       "      <td>0.599396</td>\n",
       "      <td>-0.282867</td>\n",
       "      <td>-1.146737</td>\n",
       "      <td>-0.158629</td>\n",
       "      <td>-0.464418</td>\n",
       "      <td>-0.463202</td>\n",
       "      <td>-0.25204</td>\n",
       "      <td>-0.249464</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>-0.067792</td>\n",
       "      <td>0.925753</td>\n",
       "      <td>-0.104067</td>\n",
       "      <td>0.514274</td>\n",
       "      <td>-0.002824</td>\n",
       "      <td>0.014179</td>\n",
       "      <td>-0.006673</td>\n",
       "      <td>-0.04772</td>\n",
       "      <td>-0.002571</td>\n",
       "      <td>-0.044136</td>\n",
       "      <td>...</td>\n",
       "      <td>-3.142633</td>\n",
       "      <td>-1.505707</td>\n",
       "      <td>0.599396</td>\n",
       "      <td>-0.282867</td>\n",
       "      <td>-1.188291</td>\n",
       "      <td>-0.158629</td>\n",
       "      <td>-0.464418</td>\n",
       "      <td>-0.463202</td>\n",
       "      <td>-0.25204</td>\n",
       "      <td>-0.249464</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>-0.067792</td>\n",
       "      <td>0.925753</td>\n",
       "      <td>-0.104067</td>\n",
       "      <td>0.514274</td>\n",
       "      <td>-0.002840</td>\n",
       "      <td>0.014179</td>\n",
       "      <td>-0.006673</td>\n",
       "      <td>-0.04772</td>\n",
       "      <td>-0.002571</td>\n",
       "      <td>-0.044136</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.988182</td>\n",
       "      <td>-1.411403</td>\n",
       "      <td>0.599396</td>\n",
       "      <td>-0.282867</td>\n",
       "      <td>-1.188291</td>\n",
       "      <td>-0.158629</td>\n",
       "      <td>-0.464418</td>\n",
       "      <td>-0.463202</td>\n",
       "      <td>-0.25204</td>\n",
       "      <td>-0.249464</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>-0.067792</td>\n",
       "      <td>0.925753</td>\n",
       "      <td>-0.104067</td>\n",
       "      <td>0.514274</td>\n",
       "      <td>-0.002842</td>\n",
       "      <td>0.035214</td>\n",
       "      <td>-0.006673</td>\n",
       "      <td>-0.04772</td>\n",
       "      <td>-0.002571</td>\n",
       "      <td>-0.044136</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.833731</td>\n",
       "      <td>-1.317100</td>\n",
       "      <td>0.599396</td>\n",
       "      <td>-0.282867</td>\n",
       "      <td>-1.209067</td>\n",
       "      <td>-0.158629</td>\n",
       "      <td>-0.464418</td>\n",
       "      <td>-0.463202</td>\n",
       "      <td>-0.25204</td>\n",
       "      <td>-0.249464</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 41 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   duration  protocol_type   service      flag  src_bytes  dst_bytes  \\\n",
       "0 -0.067792       0.925753 -0.104067  0.514274  -0.002879   0.138664   \n",
       "1 -0.067792       0.925753 -0.104067  0.514274  -0.002820  -0.011578   \n",
       "2 -0.067792       0.925753 -0.104067  0.514274  -0.002824   0.014179   \n",
       "3 -0.067792       0.925753 -0.104067  0.514274  -0.002840   0.014179   \n",
       "4 -0.067792       0.925753 -0.104067  0.514274  -0.002842   0.035214   \n",
       "\n",
       "       land  wrong_fragment    urgent       hot  ...  dst_host_count  \\\n",
       "0 -0.006673        -0.04772 -0.002571 -0.044136  ...       -3.451536   \n",
       "1 -0.006673        -0.04772 -0.002571 -0.044136  ...       -3.297085   \n",
       "2 -0.006673        -0.04772 -0.002571 -0.044136  ...       -3.142633   \n",
       "3 -0.006673        -0.04772 -0.002571 -0.044136  ...       -2.988182   \n",
       "4 -0.006673        -0.04772 -0.002571 -0.044136  ...       -2.833731   \n",
       "\n",
       "   dst_host_srv_count  dst_host_same_srv_rate  dst_host_diff_srv_rate  \\\n",
       "0           -1.694315                0.599396               -0.282867   \n",
       "1           -1.600011                0.599396               -0.282867   \n",
       "2           -1.505707                0.599396               -0.282867   \n",
       "3           -1.411403                0.599396               -0.282867   \n",
       "4           -1.317100                0.599396               -0.282867   \n",
       "\n",
       "   dst_host_same_src_port_rate  dst_host_srv_diff_host_rate  \\\n",
       "0                    -1.022077                    -0.158629   \n",
       "1                    -1.146737                    -0.158629   \n",
       "2                    -1.188291                    -0.158629   \n",
       "3                    -1.188291                    -0.158629   \n",
       "4                    -1.209067                    -0.158629   \n",
       "\n",
       "   dst_host_serror_rate  dst_host_srv_serror_rate  dst_host_rerror_rate  \\\n",
       "0             -0.464418                 -0.463202              -0.25204   \n",
       "1             -0.464418                 -0.463202              -0.25204   \n",
       "2             -0.464418                 -0.463202              -0.25204   \n",
       "3             -0.464418                 -0.463202              -0.25204   \n",
       "4             -0.464418                 -0.463202              -0.25204   \n",
       "\n",
       "   dst_host_srv_rerror_rate  \n",
       "0                 -0.249464  \n",
       "1                 -0.249464  \n",
       "2                 -0.249464  \n",
       "3                 -0.249464  \n",
       "4                 -0.249464  \n",
       "\n",
       "[5 rows x 41 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from numpy import where\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "X = df.iloc[:,:41]\n",
    "X = StandardScaler().fit_transform(X)\n",
    "y = df.iloc[:,-1].astype('int')\n",
    "\n",
    "pd.DataFrame(data = X, columns = col_names[:-1]).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 86
    },
    "colab_type": "code",
    "id": "M6URlefYs9Ji",
    "outputId": "575c6ba1-58f5-4f04-fbb1-e0fe1e9dc912"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([26. , 37.9, 46.9, 54.5, 59.2, 63.3, 66.4, 69.3, 72.2, 74.8, 77.4,\n",
       "       79.9, 82.4, 84.8, 87. , 89.2, 91.2, 93.1, 94.9, 95.9, 96.8, 97.7,\n",
       "       98.3, 98.8, 99.2, 99.5, 99.6, 99.7, 99.8, 99.9, 99.9, 99.9, 99.9,\n",
       "       99.9, 99.9, 99.9, 99.9, 99.9, 99.9, 99.9, 99.9])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "covar_matrix = PCA(n_components = 41)\n",
    "covar_matrix.fit(X)\n",
    "variance = covar_matrix.explained_variance_ratio_ #calculate variance ratios\n",
    "\n",
    "var=np.cumsum(np.round(covar_matrix.explained_variance_ratio_, decimals=3)*100)\n",
    "var #cumulative sum of variance explained with [n] features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 312
    },
    "colab_type": "code",
    "id": "rIEsq9aCtWEl",
    "outputId": "e4bb1fa6-2901-4db5-a41f-43a50273f3ab"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x7f40fd6de668>]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dd3yV5f3/8dcnYYQdNmGEISgCsgyCq9/WLfoVB7W2blFq69evtrWOb+2w1lE7rO2vtdI6cNWBA7VuRKy1BcKeygpIIKwQ9sj4/P6478Q0JuGQ5OQ+yXk/H4/zOPc814f7Qc7n3Nd13ddl7o6IiAhAStQBiIhI4lBSEBGRMkoKIiJSRklBRETKKCmIiEgZJQURESmjpCCSQMwsx8xOq+Vn7DazfnUVkyQXJQVp8MIv0n3hl+EmM3vCzFqX23+mmX1kZrvMbIuZzTCz8yp8xlfNzM3sthjL7GtmJWb2cF3/e2rL3Vu7++qo45CGSUlBGov/dvfWwEggC7gTwMzGAy8CTwI9ga7AT4D/rnD+lUA+cEWM5V0BbAe+YWbNax29SIJQUpBGxd1zgbeAIWZmwG+Bu939r+6+w91L3H2Gu19Xeo6ZtQLGAzcAA8wsq7oyws+9giDxFFIhwYR3HNeb2QozKzCzP4bnYGZHmNkHZrbNzLaa2TNmll5JGd3MbK+ZdSy3bWR4p9PUzPqHdzw7ws95vkL5/cPlsWa2NLxLyjWzWw7zkkqSUVKQRsXMegFjgXnAUUAvYMohTrsQ2E1wR/EOwV1DdU4iuOt4DnihiuPPBUYBQ4GLgTNLQwTuA7oDR4fx/aziye6eB3wYnlvqcuA5dy8E7gbeBdqHsfyhilgfBb7t7m2AIcAHh/i3SZJTUpDG4lUzKwA+BmYA9wKlv7I3HuLcK4Hn3b0YeBa4xMyaHuL4t9x9e3j8WWbWpcIx97t7gbuvA6YDwwHcfaW7v+fuB9x9C8GdzH9VUc5k4DIAM0sFvgk8Fe4rBHoD3d19v7t/XMVnFAKDzKytu29397nV/LtElBSk0Tjf3dPdvbe7f9fd9wHbwn0ZVZ0U3ll8DXgm3DQVSAPOqeL4FsDXS493938B64BvVTg0r9zyXqB1eH5XM3surMrZCTwNdKoivKkEX+h9gdOBHe4+K9x3K8FdxywzW2Jm11TxGRcR3DmtDaubjq/iOBFASUEat0+Bzwm+GKtyOcHfwetmlgesJkgKVVUhXQC0Bf5kZnnhOT2qOb6iewEHjnH3tgR3AlbZge6+n6B66rIwzqfK7ctz9+vcvTvw7TCe/pV8xmx3Hwd0AV4NP0+kSkoK0mh5MC7894Efm9nVZtbWzFLM7CQzmxQediVwF0H1TunrImBs+Ubecq4EHgOOKXf8icAwMzsmhrDaELRf7DCzHsAPD3H8k8BVwHmUSwpm9nUz6xmubidINCXlTzSzZmZ2qZm1C9shdlY8RqQiJQVp1Nx9CvAN4BpgA7AJ+AUw1czGENTL/zH85V36eg1YSVCHXyb8Ej8V+F2F4+cAbxPb3cJdBN1mdwB/B14+RPz/JPgin+vua8vtGgXMNLPdwGvATVU8m3A5kBNWVV0PXBpDjJLETJPsiCQ2M/sAeNbd/xp1LNL4KSmIJDAzGwW8B/Ry911RxyONn6qPRBKUmU0G3gduVkKQ+qI7BRERKRO3OwUze8zMNpvZ4nLbOpjZe+Hj/++ZWftwu5nZ781spZktNLOR8YpLRESqFrc7BTP7CkHXuyfdfUi47QEg393vN7PbgfbufpuZjQVuJHjIZjTwkLuPPlQZnTp18j59+sQlfhGRxmrOnDlb3b1zZfuaxKtQd//IzPpU2DwO+Gq4PJlgbJfbwu1Phv3K/21m6WaW4e7VDk/Qp08fsrOz6zJskaRXXOLsLywOXkUlXywXFrO/sOQ/34u+WD5Q4fh9hV8sHygsoVhV1XXqupP7cdaQbjU6
18zWVrUvbkmhCl3LfdHnEQxjDMEToZ+XO259uO1LScHMJgITATIzM+MXqUgDtmt/IZt27idvxwE27tgXLO/cz+adB9hX7gu+/PKB8Eu+sLjmX95pTVNIa5pKWpPUL5abBsvNUlLr8F8oTVMrfRC+1uo7KZRxdzezw/7f5+6TgEkAWVlZ+ukhSamkxNmwYx9rt+0lZ9se1m7by5qte1i7bQ8bCvaz+0DRl85p37IpXdqk0bJ58KXdqXUTWjQLlps3Ta3iCz14b17Jl3xwXCotmqbSvGkKzZukEI4QLg1YfSeFTaXVQmaWAWwOt+cSDCFcqme4TSSplZQ4n2/fy7KNu/g0bxfL83ayYvNu1uXv5WDRFyNWNG+SQu+OLendsRUnHNGJjHZpdGuXRre2wXvXtmmkNdUvdTm0+k4KrxEMBXB/+D613Pb/MbPnCBqadxyqPUGksTlQVMyyjbtY8HkByzbuZHneLj7btIu9B4sBMIPeHVpyZNc2nDqwC306taJ3x5b06diKbm3TSEnRr3SpvbglBTP7G0GjciczWw/8lCAZvGBmE4C1fDGByJsEPY9WEgwzfHW84hJJBO7Omq17WLC+gPnrCpi/fgfLNuzkYHHw6799y6YM7NaWi7N6cXRGG47q1pYju7amZbPIanwlScSz99E3q9h1aiXHOsFUiCKN1o69hUxbvol3luTx79X57NhXCEDLZqkM7dmOq0/qw/Ce6QzrlU5GuzTVz0sk9LNDJI4279rPe0s38fbiPP61ahtFJU63tmmcPaQbIzKDBDCgSxtSVfUjCUJJQaSOfZ6/l3eW5PHOkjyy127HHfp0bMmEk/ty1uBuDOuZrvp/SVhKCiK15O6s3Lybtxfn8faSPJZs2AnA0RltufnUIzlrSDeO7Npa1UHSICgpiNSAu7Nw/Q7eXpLHO4vzWL11DwAjM9P5v7EDOXNwN3p3bBVxlCKHT0lB5DAsz9vJq/M28PqCDeQW7CM1xTi+X0euPqkvZwzqSte2aVGHKFIrSgoih5BbsI/X5m9g6vxcluftIjXFOHlAJ753+pGcdnQX0ls2izpEkTqjpCBSiT0Hipg6fwOvzs9l1pp8IKgauuu8wZwzNINOrZtHHKFIfCgpiFQw/dPN3PnKYnIL9nFE51b84PQjGTe8B5kdW0YdmkjcKSmIhLbuPsDdbyxl6vwN9O/SmucmjmF03w7qNSRJRUlBkp678/LcXO7++1L2HCji5tMG8J2vHkHzJhpATpKPkoIktXXb9vKjVxfxjxVbObZ3e+6/8BgGdG0TdVgikVFSkKRUXOI89vEafvPepzRJSeHucYO5dHRvPWksSU9JQZLO6i27+cGLC5i3roDTju7C3ecPIaNdi6jDEkkISgqSNEpKnMc/yeGBt5eT1jSVhy4ZznnDuqshWaQcJQVJCmu37eGHLy5kVk4+pw7swn0XHkMXPX0s8iVKCtKolZQ4z8xcy31vLSfVjF+NH8r4Y3vq7kCkCkoK0mjlFuzj1ikL+OfKbZw8oBO/vGgo3dPVdiBSHSUFaZSmzs/lzlcWU+LOvRccwzeP66W7A5EYRJIUzOwm4DrAgL+4++/MrAPwPNAHyAEudvftUcQnDdfO/YX85NXFvDp/A8f2bs/vvjGcXh00PIVIrFLqu0AzG0KQEI4DhgHnmll/4HZgmrsPAKaF6yIxm52Tz9m/+wevL9zI908/kucnjlFCEDlMUdwpHA3MdPe9AGY2A7gQGAd8NTxmMvAhcFsE8UkDU1hcwu+nreCP01fSs31LXrz+eEZmto86LJEGKYqksBi4x8w6AvuAsUA20NXdN4bH5AFdKzvZzCYCEwEyMzPjH60ktJyte7jp+fks+LyA8cf25GfnDaZ1czWVidRUvf/1uPsyM/sl8C6wB5gPFFc4xs3Mqzh/EjAJICsrq9JjJDlMnZ/LHS8vokmK8cdvjeScoRlRhyTS4EXyk8rdHwUeBTCze4H1wCYzy3D3jWaWAWyOIjZJ
fAeLSrj3zWU88UkOo/q056FLRqirqUgdiar3URd332xmmQTtCWOAvsCVwP3h+9QoYpPEtmnnfm54Zi7Za7dzzYl9uWPsQJqm1nt/CZFGK6rK15fCNoVC4AZ3LzCz+4EXzGwCsBa4OKLYJEHNXL2NG56dx54DRfz+myM4b1j3qEMSaXSiqj46uZJt24BTIwhHEpy78+jHa7jvreX07tCSZ68bzZGa80AkLtRNQxLangNF3PrSQv6+cCNnDu7Kr78+jDZpTaMOS6TRUlKQhLVqy26uf2oOq7bs5razBnL9f/XTUBUicaakIAnp7cUbueXFhTRrksJTE0ZzYv9OUYckkhSUFCShFBWX8Kt3P+WRGasZ1iudhy8dqe6mIvVISUESxtbdB7jx2Xn8a/U2LhuTyY/PHUTzJqlRhyWSVJQUJCHMXbed7z49l+17D/Lrrw9j/LE9ow5JJCkpKUik3J2nZ67j568voVu7NF7+7gkM7t4u6rBEkpaSgkTmYFEJP351Mc9nf84pA7vw4MXDaddS3U1FoqSkIJHYsbeQ65+ew79Wb+PGU/rzvdOOJCVF3U1FoqakIPUuZ+serpk8m/X5+3jwG8O4YITaD0QShZKC1KtZa/L59lPZADx97WiO69sh4ohEpDwlBak3r8xbz21TFtGzfQseu2oUfTq1ijokEalASUHizt158P0V/H7aCo7v15GHLxtJestmUYclIpVQUpC42l9YzK1TFvLagg18/die3HPBMTRrovkPRBKVkoLEzY69hVz3VDaz1uTzwzOP4rtfPUID2okkOCUFiYsNBfu46vFZrNm6h4cuGc644T2iDklEYqCkIHVu2cadXPX4LPYeKGby1cdxgkY4FWkwlBSkTn2yaivffnIOrZo34YXrj+fojLZRhyQihyGSFj8z+56ZLTGzxWb2NzNLM7O+ZjbTzFaa2fNmpu4pDczU+blc+dgsMtKDMYyUEEQannpPCmbWA/hfIMvdhwCpwCXAL4EH3b0/sB2YUN+xSc24O5M+WsVNz81nZGZ7Xvz2CZoDQaSBiqpvYBOghZk1AVoCG4FTgCnh/snA+RHFJoehuMS56/Wl3Pvmcs4ZmsGTE47ToHYiDVi9tym4e66Z/RpYB+wD3gXmAAXuXhQeth6otLuKmU0EJgJkZmbGP2Cp0v7CYr73/HzeWpzHhJP68qOxR2tQO5EGrsqkYGa7AK9qv7vXqMLYzNoD44C+QAHwInBWrOe7+yRgEkBWVlaV8Ul8Few9yHVPZjM7Zzt3nnM0157cL+qQRKQOVJkU3L0NgJndTVC98xRgwKVARi3KPA1Y4+5bws9/GTgRSDezJuHdQk8gtxZlSByt376Xqx6fzbpte/l/3xrBuUO7Rx2SiNSRWNoUznP3P7n7Lnff6e4PE/zSr6l1wBgza2nB462nAkuB6cD48Jgrgam1KEPiZMmGHVz4p0/YtHM/T044TglBpJGJJSnsMbNLzSzVzFLM7FJgT00LdPeZBA3Kc4FFYQyTgNuA75vZSqAj8GhNy5D4+HjFVr7xyL9JTTGmXH8CY/p1jDokEaljsTQ0fwt4KHw58M9wW425+0+Bn1bYvBo4rjafK/Hzyrz1/PDFhfTv0prHrx5FRjt1ORVpjA6ZFNw9h9pVF0kDN+mjVdz75nKO79eRR644lrZp6nIq0lgdsvrIzI40s2lmtjhcH2pmd8Y/NImau3PfW8vKnkF44ppRSggijVwsbQp/Ae4ACgHcfSHBE8jSiBWXOHe8vIhHZqzm0tGZ/P6SETRvkhp1WCISZ7G0KbR091kVxsEvqupgafj2FxZz83PzeXtJHjee0p/vn36k5kEQSRKxJIWtZnYE4YNsZjae4LkFaYR2Hyhi4pPZfLJqGz8+dxATTuobdUgiUo9iSQo3EHQZHWhmucAa4LK4RiWRyN9zkKsen8WSDTv5zdeHcdGxPaMOSUTqWSy9j1YDp5lZKyDF3XfFPyypbxsK9nH5ozNZv30fj1x2
LKcN6hp1SCISgUMmBTNrDlwE9AGalNYtu/vP4xqZ1JuVm3dzxaMz2bW/iCevOY7ReihNJGnFUn00FdhBMJLpgfiGI/Vt4foCrnp8NikGf5s4hiE92kUdkohEKJak0NPdYx7FVBqOT1Zt5brJ2aS3bMbT146mb6dWUYckIhGL5TmFT8zsmLhHIvXq7cV5XPXYbHq0b8FL3zlBCUFEgNjuFE4CrjKzNQTVRwa4uw+Na2QSNy/M/pzbX17IsF7pPH7VKNJbajpsEQnEkhTOjnsUUm8embGK+95azskDOvHI5cfSslm9T74nIgmsupnX2rr7TkBdUBsBd+f+t5fzyIzVnDM0gwcvHk6zJlFN0S0iiaq6n4nPAucS9DpygmqjUg5o/sUGwt35ydQlPPXvtXxrdCZ3jxtCquZSFpFKVDcd57nhu8Y5aMDcnZ+/sZSn/r2Wb3+lH7efPVDjGIlIlWKqUDaz9sAAIK10m7t/FK+gpG6UVhk9/s8crj6xjxKCiBxSLE80XwvcBPQE5gNjgH8Bp8Q3NKmtB99fwSMzVnPZmEx+cu4gJQQROaRYWhpvAkYBa939a8AIoKCmBZrZUWY2v9xrp5ndbGYdzOw9M1sRvrevaRkCf5y+kt9PW8HFWT35+XlDlBBEJCaxJIX97r4fgnGQ3H05cFRNC3T3T919uLsPB44F9gKvALcD09x9ADAtXJca+MtHq/nVO59ywYge3HfhUFLUqCwiMYolKaw3s3TgVeA9M5sKrK2j8k8FVrn7WoJ5oCeH2ycD59dRGUll8ic53PPmMs45JoNfjR+qXkYiclhiGTr7gnDxZ2Y2HWgHvF1H5V8C/C1c7urupZP35AGVjt1sZhOBiQCZmZl1FEbj8LdZ6/jpa0s4Y1BXfnfJcJqk6jkEETk85u6V7zDrUN2J7p5fq4LNmgEbgMHuvsnMCtw9vdz+7e5ebbtCVlaWZ2dn1yaMRuOlOeu5ZcoCvnpkZ/58+bGaT1lEqmRmc9w9q7J91d0pVPbQWqm6eHjtbGCuu28K1zeZWYa7bzSzDGBzLT8/aby2YAM/nLKAE4/oxMOXKSGISM1V9/BavB9a+yZfVB0BvAZcCdwfvk+Nc/mNwluLNvK95+czqk8H/nJFFmlNlRBEpOZifXjtQoLRUh34h7u/WptCw6k9Twe+XW7z/cALZjaBoCH74tqUkQzeX7qJG/82j+G90nnsqlG0aKaEICK1E8vDa38C+vPFr/rrzex0d7+hpoW6+x6gY4Vt2wh6I0kMZny2he8+M5fB3dvy+NWjaNVco52KSO3F8k1yCnC0hy3SZjYZWBLXqKRan6zcysQns+nfpTVPXjOatmlNow5JRBqJWPosrgTK9/3sFW6TCMzOyWfC5Gx6d2zJ09eOpl1LJQQRqTux3Cm0AZaZ2SyCNoXjgGwzew3A3c+LY3xSzrx127n68dlkpKfxzLVj6NBKM6aJSN2KJSn8JO5RyCF9tmkXVz0+m46tm/HstWPo3KZ51CGJSCMUS1LY4u5Ly28ws6+6+4fxCUkqyi3YxxWPzqJ5kxSenjCabu3SDn2SiEgNxNKm8IKZ3WqBFmb2B+C+eAcmgfw9B7n80ZnsOVjE5GuOo1eHllGHJCKNWCxJYTRBQ/MnwGyCoSlOjGdQEth7sIhrnpjN+u37+OsVWRyd0TbqkESkkYslKRQC+4AWBDOvrXH3krhGJRQWl/Cdp+eycH0Bf/jmCEb363jok0REaimWpDCbICmMAk4GvmlmL8Y1qiRXUuLcOmUhMz7bwr0XHMOZg7tFHZKIJIlYGponuHvpUKQbgXFmdnkcY0pq7s69by7jlXm53HLGkVxynIYHF5H6U+WdgpmdAuDu2WZWcXC8PXGNKolN+mg1f/14DVed0IcbvtY/6nBEJMlUV33063LLL1XYd2ccYkl6by7ayH1vLefcoRn85NxBmldZROpddUnBqliubF1qadPO/dzx8iKG9UrnNxcP07zKIhKJ6pKCV7Fc
2brUgnvQsHygqJjfXjxMk+SISGSqa2juF45vZOWWCdfjPQFPUnl21jpmfLaFu84bzBGdW0cdjogkseqSwrhyy7+usK/iutTQ2m17uOfvyzipfycuH9M76nBEJMlVNx3njPoMJBkVlzg/eGEBqSnGA+OHqh1BRCKn6boi9Jd/rCZ77XYe/MYwuqe3iDocEZGYnmiuc2aWbmZTzGy5mS0zs+PNrIOZvWdmK8L39lHEVl+WbdzJb9/9jLOHdOP84T2iDkdEBDiMpGBmdTk850PA2+4+EBgGLANuB6a5+wBgWrjeKB0oKuZ7z8+nbYum/OL8IXoeQUQSxiGTgpmdYGZLgeXh+jAz+1NNCzSzdsBXgEcB3P2guxcQNGxPDg+bDJxf0zIS3UPvr2B53i7uv/AYOrbWZDkikjhiuVN4EDgT2Abg7gsIvtRrqi+wBXjczOaZ2V/NrBXQ1d03hsfkAV0rO9nMJppZtpllb9mypRZhRGPO2nz+PGMV38jqxWmDKv0niohEJqbqI3f/vMKm4lqU2QQYCTzs7iMIxlH6j6oid3eqeEDO3Se5e5a7Z3Xu3LkWYdS//YXF/OCFBXRPb8Gd5x4ddTgiIl8SS1L43MxOANzMmprZLQRtADW1Hljv7jPD9SkESWKTmWUAhO+ba1FGQvrzjFXkbNvLAxcNpU1a06jDERH5kliSwvXADUAPIBcYHq7XiLvnESSao8JNpwJLgdeAK8NtVwJTa1pGIlq3bS9/+nAV/z2sOyf07xR1OCIilTrkcwruvhW4tI7LvRF4xsyaAauBqwkS1AtmNgFYC1xcx2VG6udvLKFpivGjsao2EpHEdcikYGaTgZvCHkKEzw/8xt2vqWmh7j4fyKpk16k1/cxENm3ZJt5ftpn/GzuQbu3Sog5HRKRKsVQfDS1NCADuvh0YEb+QGpf9hcX87PUl9O/SmqtP1DiCIpLYYkkKKeWfLjazDmh4jJj9ecYqPs/fx8/PG0zT1EgeIBcRiVksX+6/Af5lZi8SDJs9HrgnrlE1Euu27eXhD1dx7tAMNS6LSIMQS0Pzk2Y2B/hauOlCd18a37Aah5+/sYTUFONH56hxWUQahlirgZYD20uPN7NMd18Xt6gagQ+WB43Ld5w9kIx2GgFVRBqGWHof3Qj8FNhE8CSzETxtPDS+oTVc+wuL+dlrSzmicys1LotIgxLLncJNwFHuvi3ewTQWj8xYzbr8vTxz7WiaNVHjsog0HDENcwHsiHcgjcXn+Xv504crOWdoBieqcVlEGphY7hRWAx+a2d+BA6Ub3f23cYuqAXvw/c9IMeNONS6LSAMUS1JYF76ahS+pwq79hby5aCMXjuypxmURaZBi6ZJ6V30E0hi8tSiP/YUlXDSyZ9ShiIjUSCy9jzoDtwKDgbKBe9z9lDjG1SBNmbuevp1aMTIzPepQRERqJJaG5mcInlPoC9wF5ACz4xhTg/R5/l5mrcnnopE9NOeyiDRYsSSFju7+KFDo7jPC0VF1l1DBS3PXYwYXqOpIRBqwWBqaC8P3jWZ2DrAB6BC/kBoed+flubkc368jPdLVwCwiDVcsSeEXZtYO+AHwB6At8L24RtXAZK/dzrr8vdx06oCoQxERqZVYeh+9ES7u4ItB8aScl+asp2WzVM4a0i3qUEREaqXKpGBmt7r7A2b2B4Kxjv6Du/9vXCNrIPYXFvP3hRs5e0gGrZprmgkRadiq+xZbFr5n13WhZpYD7CIYYK/I3bPCyXueB/oQ9HC6OJzlLaG9sySPXQeKuGhkj6hDERGptSqTgru/bmapwDHufkscyv6au28tt347MM3d7zez28P12+JQbp16aW4uPdJbMKZfx6hDERGptWq7pLp7MXBiPcUyDpgcLk8Gzq+ncmts0879fLxiCxeM6EFKip5NEJGGL5ZK8Plm9hrwIrCndKO7v1yLch1418wceMTdJwFd3X1juD8P6FrZiWY2EZgIkJmZWYsQau+VebmUOFyoqiMRaSRi
SQppwDb+84E1B2qTFE5y91wz6wK8Z2bLy+90dw8TxpeECWQSQFZWVqXH1Ad356U56xmZmU6/zq2jCkNEpE7F0iX16rou1N1zw/fNZvYKcBywycwy3H2jmWUAm+u63Lq0KHcHKzbv5p4LhkQdiohInYllQLw0YAJfHhDvmpoUaGatgBR33xUunwH8HHgNuBK4P3yfWpPPry8vzVlPsyYpnDu0e9ShiIjUmVjGPnoK6AacCcwAehJ0J62prsDHZrYAmAX83d3fJkgGp5vZCuC0cD0hHSwq4bUFGzh9UFfatWgadTgiInUmljaF/u7+dTMb5+6TzexZ4B81LdDdVwPDKtm+DTi1pp9bn6Z/upntewsZr8HvRKSRieVOoXRAvAIzGwK0A7rEL6TE99Kc9XRq3ZyTB2gOZhFpXGJJCpPMrD1wJ0G9/1Lgl3GNKoHl7znI9E83c/7w7jRJjeXyiYg0HNWNfdTN3fPc/a/hpo+AfvUTVuJ6d0kehcXO+SP0bIKIND7V/dSdb2bvm9kEM9P8kqF3l26iZ/sWDO7eNupQRETqXHVJoQfwK+Ak4FMzm2pml5hZ0s4is/tAER+v3MoZg7ppyk0RaZSqTAruXuzu74QPr/UCHiMYn2iNmT1TXwEmko8+28LBohLOGFzpCBwiIg1eTC2l7n6QoIF5GbATODqeQSWqd5fk0b5lU7J6t486FBGRuKg2KZhZLzP7oZnNBd4Ijz/P3UfWS3QJpLC4hGnLN3Pa0V3V60hEGq3qeh99QtCu8AJwnbvPqbeoEtDM1fns2l/EGYM15aaINF7VPdF8O/APd49sJNJE8s6SPFo0TdUDayLSqFU389pH9RlIIispcd5buomvHNmJtKapUYcjIhI3qhyPwaLcHeTt3M8Zg1R1JCKNm5JCDN5dmkdqinHq0Uk95JOIJIGYk4KZjTGzt83sQzNL+PmT69K7SzYxum8H0ls2izoUEZG4qjIpmFnFupLvAxcAY4G74xlUIlm9ZTcrNu/mjEF6YE1EGr/qeh/9OXw+4QF33w8UAOOBEoIH2JLCe0s3AXC6uqKKSBKobpiL84F5wBtmdgVwM9Ac6AgkTfXRu0s3MaRHW3qkJ+2QTyKSRKptU3D31wmm4WwHvAJ85kaGJuYAAAy/SURBVO6/d/ct9RFc1Dbv2s/cddvV60hEkkZ1bQrnmdl04G1gMfANYJyZPWdmR9S2YDNLNbN5ZvZGuN7XzGaa2Uoze97MIm/VfX/pZtzRAHgikjSqu1P4BXA2cDHwS3cvcPcfAD8G7qmDsm8iGGCv1C+BB929P7AdmFAHZdTKu0vzyOzQkqO6tok6FBGRelFdUtgBXAhcBGwu3ejuK9z9ktoUamY9gXOAv4brBpwCTAkPmUzE7Ra79hfyycptnDm4q+ZOEJGkUV1SuICgUbkJ8K06Lvd3wK0EPZkIyylw96JwfT3BYHxfYmYTzSzbzLK3bIlf08aMz7ZwsLhEA+CJSFKprvfRVnf/g7v/2d3rrAuqmZ0LbK7pqKvuPsnds9w9q3PnznUV1pe8u2QTHVs1Y2Sm5k4QkeRR3XMK8XIicJ6ZjQXSgLbAQ0C6mTUJ7xZ6ArkRxAbAwaISpi/fzNhjMkhNUdWRiCSPeh/7yN3vcPee7t4HuAT4wN0vBaYTPBwHcCUwtb5jK/Xv1dvYdaBIvY5EJOkk0oB4twHfN7OVBG0Mj0YVyLtL82jZLJUT+2vuBBFJLlFUH5Vx9w+BD8Pl1cBxUcZTavryLXxlQGfNnSAiSSeR7hQSwvrte8kt2MfxR3SMOhQRkXqnpFBBds52ALL6qNeRiCQfJYUKZuXk06Z5EwZ2axt1KCIi9U5JoYLsnHyO7dNeXVFFJCkpKZSzfc9BPtu0m1F9OkQdiohIJJQUysleG7QnKCmISLJSUignOyefZqkpDO3ZLupQREQioaRQzqycfIb2bKfnE0QkaSkphPYdLGZx7g6yVHUkIklMSSE0//MCCoud4/rq+QQR
SV5KCqHZOfmYwbGZulMQkeSlpBCanZPPUV3b0K5l06hDERGJjJICUFRcwty129UVVUSSnpICsGzjLvYcLNZ4RyKS9JQUCKqOAI7rqzsFEUluSgoESaFn+xZktGsRdSgiIpFK+qTg7szOUXuCiAgoKZCzbS9bdx9QUhARIYKkYGZpZjbLzBaY2RIzuyvc3tfMZprZSjN73sya1Uc8s9cE7Qmj1MgsIhLJncIB4BR3HwYMB84yszHAL4EH3b0/sB2YUB/BzM7Jp33LpvTv0ro+ihMRSWj1nhQ8sDtcbRq+HDgFmBJunwycXx/xzM7J59jeHTDTpDoiIpG0KZhZqpnNBzYD7wGrgAJ3LwoPWQ/0qOLciWaWbWbZW7ZsqVUcm3ftJ2fbXo13JCISiiQpuHuxuw8HegLHAQMP49xJ7p7l7lmdO3euVRzZOZpUR0SkvEh7H7l7ATAdOB5IN7Mm4a6eQG68y5+1Jp+0pikM7q5JdUREIJreR53NLD1cbgGcDiwjSA7jw8OuBKbGO5bstfmM6NWeZk2SvmeuiAgQzZ1CBjDdzBYCs4H33P0N4Dbg+2a2EugIPBrPIHbtL2Tphp3qiioiUk6TQx9St9x9ITCiku2rCdoX6sW8dQWUOIzSeEciImWStt5kdk4+qSnGiEzdKYiIlErapDBrTT6DMtrSunm93yyJiCSspEwKB4tKmP95gbqiiohUkJRJYVHuDg4UlaiRWUSkgqRMCqWT6mTpTkFE5D8kZYX62CEZdG7dnM5tmkcdiohIQknKpJDZsSWZHVtGHYaISMJJyuojERGpnJKCiIiUUVIQEZEySgoiIlJGSUFERMooKYiISBklBRERKaOkICIiZZQURESkjJKCiIiUUVIQEZEy9Z4UzKyXmU03s6VmtsTMbgq3dzCz98xsRfiuca1FROpZFHcKRcAP3H0QMAa4wcwGAbcD09x9ADAtXBcRkXpU70nB3Te6+9xweRewDOgBjAMmh4dNBs6v79hERJJdpENnm1kfYAQwE+jq7hvDXXlA1yrOmQhMDFd3m9mnNSy+E7C1hufGk+I6PIrr8CVqbIrr8NQmrt5V7TB3r+Fn1o6ZtQZmAPe4+8tmVuDu6eX2b3f3uLUrmFm2u2fF6/NrSnEdHsV1+BI1NsV1eOIVVyS9j8ysKfAS8Iy7vxxu3mRmGeH+DGBzFLGJiCSzKHofGfAosMzdf1tu12vAleHylcDU+o5NRCTZRdGmcCJwObDIzOaH2/4PuB94wcwmAGuBi+Mcx6Q4f35NKa7Do7gOX6LGprgOT1ziiqxNQUREEo+eaBYRkTJKCiIiUiYpk4KZnWVmn5rZSjNLmCenzSzHzBaZ2Xwzy44wjsfMbLOZLS63LfJhSKqI62dmlhtes/lmNjaCuBJy6JZq4or0mplZmpnNMrMFYVx3hdv7mtnM8O/yeTNrliBxPWFma8pdr+H1GVe5+FLNbJ6ZvRGux+d6uXtSvYBUYBXQD2gGLAAGRR1XGFsO0CkB4vgKMBJYXG7bA8Dt4fLtwC8TJK6fAbdEfL0ygJHhchvgM2BQ1NesmrgivWaAAa3D5aYED6+OAV4ALgm3/xn4ToLE9QQwPsr/Y2FM3weeBd4I1+NyvZLxTuE4YKW7r3b3g8BzBENsSMjdPwLyK2yOfBiSKuKKnCfo0C3VxBUpD+wOV5uGLwdOAaaE26O4XlXFFTkz6wmcA/w1XDfidL2SMSn0AD4vt76eBPhDCTnwrpnNCYfzSCQxDUMSkf8xs4Vh9VKko+vWZOiW+lAhLoj4moVVIfMJHlJ9j+DuvcDdi8JDIvm7rBiXu5der3vC6/WgmTWv77iA3wG3AiXhekfidL2SMSkkspPcfSRwNsHosV+JOqDKeHC/mhC/oICHgSOA4cBG4DdRBRIO3fIScLO77yy/L8prVklckV8zdy929+FAT4K794H1HUNlKsZlZkOAOwjiGwV0AG6rz5jM7Fxgs7vPqY/ykjEp5AK9yq33DLdF
zt1zw/fNwCsEfyyJIiGHIXH3TeEfcgnwFyK6Zok6dEtlcSXKNQtjKQCmA8cD6WZW+kBtpH+X5eI6K6yGc3c/ADxO/V+vE4HzzCyHoLr7FOAh4nS9kjEpzAYGhC33zYBLCIbYiJSZtTKzNqXLwBnA4urPqlcJOQxJ6Zdu6AIiuGaJOnRLVXFFfc3MrLOZpYfLLYDTCdo7pgPjw8OiuF6VxbW8XGI3gnr7er1e7n6Hu/d09z4E31cfuPulxOt6Rd2iHsULGEvQE2MV8KOo4wlj6kfQE2oBsCTKuIC/EVQrFBLUVU4gqMOcBqwA3gc6JEhcTwGLgIUEX8IZEcR1EkHV0EJgfvgaG/U1qyauSK8ZMBSYF5a/GPhJuL0fMAtYCbwINE+QuD4Ir9di4GnCHkpRvICv8kXvo7hcLw1zISIiZZKx+khERKqgpCAiImWUFEREpIySgoiIlFFSEBGRMkoK0miZ2X1m9jUzO9/M7jjMczuHI1DOM7OTK+z70IJRdktHzRxf1eccooybzaxlTc4ViRclBWnMRgP/Bv4L+Ogwzz0VWOTuI9z9H5Xsv9Tdh4evKZXsj8XNwGElhXJPsIrEhZKCNDpm9iszW0gwVs2/gGuBh83sJ5Uc28fMPggHO5tmZpnhePkPAOPCO4EWMZZ7WTge/3wze8TMUsPtD5tZdoUx+v8X6A5MN7Pp4bbd5T5rvJk9ES4/YWZ/NrOZwAPh0++PhWXNM7Nx4XGDy5W/0MwG1PQaSvLSw2vSKJnZKOAKgjHoP3T3E6s47nVgirtPNrNrgPPc/XwzuwrIcvf/qeScDwnmKtgXbjoV6EKQSC5090Iz+xPwb3d/0sw6uHt+mCSmAf/r7gvDsWyy3H1r+Lm73b11uDweONfdrwqTQydgnLsXm9m9wFJ3fzoclmEWwQio94dlPhMO4ZLq7qUxisREt6LSWI0kGDJkIMG4OlU5HrgwXH6K4Is9Fpe6e9nseGb2TeBYYHYwRA4t+GIAvIvDodCbECSTQQRDKRyOF929OFw+g2CAtFvC9TQgk+Cu6EcWjL3/sruvOMwyRJQUpHEJq36eIBg1citBnb2FY+QfH8dfzgZMdvf/aNA2s77ALcAod98e/upPq+Izyt+2VzxmT4WyLnL3TyscsyysYjoHeNPMvu3uHxzmv0OSnNoUpFFx9/kejIdfOvXkB8CZYYNwZQnhE4KRJwEuBSprVI7FNGC8mXWBsvmZewNtCb7Qd5hZV4K5MkrtIpgms9QmMzvazFIIRi+tyjvAjeGonZjZiPC9H7Da3X9PMGLm0Br+WySJKSlIo2NmnYHtHswXMNDdl1Zz+I3A1WHD9OXATTUpMyzjToKZ8xYSzCaW4e4LCEbeXE4wv+4/y502CXi7tKGZYB7nNwgS1UaqdjfBVJELzWxJuA5wMbA4vCsaAjxZk3+LJDc1NIuISBndKYiISBklBRERKaOkICIiZZQURESkjJKCiIiUUVIQEZEySgoiIlLm/wOOq5I3Qt4kJgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# `plt.style.context` only applies a style while it is entered with `with`;\n",
    "# called on its own it builds a context manager and discards it.\n",
    "# The whitegrid style was renamed in newer matplotlib releases, so use\n",
    "# whichever spelling this installation provides (plain defaults otherwise).\n",
    "style_candidates = ('seaborn-whitegrid', 'seaborn-v0_8-whitegrid')\n",
    "grid_style = next((s for s in style_candidates if s in plt.style.available), 'default')\n",
    "\n",
    "with plt.style.context(grid_style):\n",
    "    plt.plot(var)  # `var` is defined in an earlier cell\n",
    "    plt.ylabel('% Variance Explained')\n",
    "    plt.xlabel('# of Features')\n",
    "    plt.title('PCA Analysis')\n",
    "    plt.ylim(20, 100.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "-0S7muQJ5L4R"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "UFXLESNSOsHP"
   },
   "outputs": [],
   "source": [
    "def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='label'):\n",
    "  \"\"\"Convert a pandas DataFrame into a batched tf.data.Dataset.\n",
    "\n",
    "  The input frame is copied, so the caller's DataFrame is left untouched.\n",
    "\n",
    "  Args:\n",
    "    dataframe: DataFrame holding the feature columns plus the label column.\n",
    "    shuffle: if True, shuffle with a buffer covering every row.\n",
    "    batch_size: number of rows per emitted batch.\n",
    "    label_column: name of the target column (defaults to 'label').\n",
    "\n",
    "  Returns:\n",
    "    A tf.data.Dataset yielding (features_dict, labels) batches, where\n",
    "    features_dict maps each remaining column name to its values.\n",
    "\n",
    "  Raises:\n",
    "    KeyError: if `label_column` is not a column of `dataframe`.\n",
    "  \"\"\"\n",
    "  features = dataframe.copy()\n",
    "  labels = features.pop(label_column)\n",
    "  ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))\n",
    "  # A shuffle buffer of size 0 is rejected by tf.data, so skip empty frames.\n",
    "  if shuffle and len(features) > 0:\n",
    "    ds = ds.shuffle(buffer_size=len(features))\n",
    "  return ds.batch(batch_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "uV3S2Ywbq6Mr"
   },
   "outputs": [],
   "source": [
    "# Fixed seed so that Restart & Run All reproduces the same partitions.\n",
    "SPLIT_SEED = 42\n",
    "\n",
    "# Hold out 20% for testing, then 20% of the remainder for validation.\n",
    "train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)\n",
    "train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)\n",
    "print(len(train), 'train examples')\n",
    "print(len(val), 'validation examples')\n",
    "print(len(test), 'test examples')\n",
    "\n",
    "# batch_size is a hyperparameter: the number of samples processed per batch.\n",
    "# Adjust it to the available RAM before modeling.\n",
    "batch_size = 50000\n",
    "\n",
    "# Create the tf.data datasets; only the training set is shuffled.\n",
    "train_ds = df_to_dataset(train, batch_size=batch_size)\n",
    "val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)\n",
    "test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "eTUjgq25q6Qz"
   },
   "outputs": [],
   "source": [
    "# Generating feature layer for keras sequential model.\n",
    "# Predictors are selected by name rather than by position: df_to_dataset\n",
    "# removes the 'label' column by name, so this stays consistent with it even\n",
    "# if 'label' is not the last column of df.\n",
    "feature_columns = [\n",
    "  feature_column.numeric_column(header)\n",
    "  for header in df.columns\n",
    "  if header != 'label'\n",
    "]\n",
    "feature_layer = tf.keras.layers.DenseFeatures(feature_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 593
    },
    "colab_type": "code",
    "id": "z4oChvXgq6Us",
    "outputId": "05062973-4ab5-45db-8734-46882f405040"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "7/7 [==============================] - 13s 2s/step - loss: -3.1743e-07 - accuracy: 0.7919 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00\n",
      "Epoch 2/5\n",
      "7/7 [==============================] - 11s 2s/step - loss: -3.1825e-07 - accuracy: 0.7919 - val_loss: -3.1816e-07 - val_accuracy: 0.7922\n",
      "Epoch 3/5\n",
      "7/7 [==============================] - 11s 2s/step - loss: -3.1768e-07 - accuracy: 0.7919 - val_loss: -3.1816e-07 - val_accuracy: 0.7922\n",
      "Epoch 4/5\n",
      "7/7 [==============================] - 11s 2s/step - loss: -3.1812e-07 - accuracy: 0.7919 - val_loss: -3.1816e-07 - val_accuracy: 0.7922\n",
      "Epoch 5/5\n",
      "7/7 [==============================] - 11s 2s/step - loss: -3.1840e-07 - accuracy: 0.7919 - val_loss: -3.1816e-07 - val_accuracy: 0.7922\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x7f40f7a86438>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create, compile, and train the model\n",
    "# A softmax over a single unit is always exactly 1.0, so the output layer\n",
    "# needs one unit per class to be able to learn anything.\n",
    "# NOTE(review): assumes df['label'] holds non-negative integer class ids\n",
    "# (0..K-1), as sparse_categorical_crossentropy requires -- confirm.\n",
    "num_classes = max(2, int(df['label'].max()) + 1)\n",
    "\n",
    "model = tf.keras.Sequential([\n",
    "  feature_layer,\n",
    "  layers.Dense(128, activation='relu'),\n",
    "  layers.Dense(128, activation='relu'),\n",
    "  layers.Dense(num_classes, activation='softmax')\n",
    "])\n",
    "\n",
    "# Integer class ids + per-class probabilities -> sparse categorical cross-entropy.\n",
    "model.compile(optimizer='adam',\n",
    "              loss='sparse_categorical_crossentropy',\n",
    "              metrics=['accuracy'])\n",
    "\n",
    "# Keep the History object so the training curves can be inspected later\n",
    "# instead of echoing its repr as the cell output.\n",
    "history = model.fit(train_ds,\n",
    "                    validation_data=val_ds,\n",
    "                    epochs=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 52
    },
    "colab_type": "code",
    "id": "gdKM5YdarNxc",
    "outputId": "384e4bcf-0e54-4a44-a3b0-df00aaa74b56"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2/2 [==============================] - 3s 1s/step - loss: -3.1550e-07 - accuracy: 0.7941\n",
      "Accuracy on heldout test dataset:  0.79414\n"
     ]
    }
   ],
   "source": [
    "# Score the trained model on the test split held out from the training samples.\n",
    "evaluation = model.evaluate(test_ds)\n",
    "loss, accuracy = evaluation\n",
    "print(\"Accuracy on heldout test dataset: \", accuracy)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explaining the model using LIME"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "S-XkECTfRYVn"
   },
   "source": [
    "## Testing Phase on the actual Test samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "2o1Ak3WJrN1-"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "_-GSxqCnrN6Z"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "RPF_vGt_q6e9"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "HmxAJaKcq6JB"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "OtBu93388X_A"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [
    "l0uviy0bMb0o"
   ],
   "name": "HW5 - Fine-Grained Malware Detection",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
